rm(list=ls())
library(tidyverse)
library(haven)
library(lubridate)

# Create Mayor Characteristics Data -------

cand_elec_files <- list.files(here::here("data","raw","TSE"),
                             pattern = "candidate_chars",
                             full.names = T)

# select only 2004, 2008 and 2012 elections
cand_elec_files <- cand_elec_files[3:5]

cand_elec_data <- map_df(cand_elec_files,read_rds)

# Keep only mayors

mayor_chars <- cand_elec_data %>% 
  filter(descricao_cargo == "PREFEITO") %>% 
  mutate(sigla_partido = str_replace_all(sigla_partido," ",""),
         sigla_partido = str_to_upper(sigla_partido),
         descricao_eleicao = str_to_upper(stringi::stri_trans_general(descricao_eleicao, "latin-ascii")))

# Create new variables
mayor_chars <- mayor_chars %>% 
  mutate(year = ano_eleicao,
         dob = str_replace_all(data_nascimento,"/|\\s",""),
         dob = ifelse(nchar(dob) == 8, dob, NA),
         dob = as_date(dob,format = "%d%m%Y"),
         age = ifelse(ano_eleicao == 2004,
                      as.integer(interval(start = dob, end = "2005-01-01")/years(1)),
                      ifelse(ano_eleicao == 2008,
                             as.integer(interval(start = dob, end = "2009-01-01")/years(1)),
                             ifelse(ano_eleicao == 2012,
                                    as.integer(interval(start = dob, end = "2013-01-01")/years(1)),
                                    NA)))) %>%
  select(year,descricao_eleicao,sigla_uf, sigla_ue,nome_municipio = descricao_ue,
         nome_candidato,sequencial_candidato,sigla_partido,
         dob,
         age,
         sex = descricao_sexo,
         education = descricao_grau_instrucao,
         marital_status = descricao_estado_civil)

# Prepare/format some variables
mayor_chars <- mayor_chars %>% 
  mutate(education = ifelse(education %in% c("1","3","9","NAO DIVULGAVEL","NAO INFORMADO"),
                       NA,
                       ifelse(education %in% c("ANALFABETO"),"ILITERATE",
                              ifelse(education %in% c("1º GRAU COMPLETO","	1º GRAU INCOMPLETO",
                                                 "ENSINO FUNDAMENTAL COMPLETO",
                                                 "ENSINO FUNDAMENTAL INCOMPLETO",
                                                 "FUNDAMENTAL COMPLETO",
                                                 "FUNDAMENTAL INCOMPLETO"),
                                     "Less than High School",
                                     ifelse(education %in% c("SUPERIOR COMPLETO","SUPERIOR INCOMPLETO"),
                                            "More than High School",
                                            "Some High School")))),
         marital_status = ifelse(marital_status == "CASADO(A)",
                                 "MARRIED",
                                 ifelse(marital_status == "SOLTEIRO(A)",
                                        "SINGLE",
                                        ifelse(marital_status %in% c("SEPARADO(A) JUDICIALMENTE","DIVORCIADO(A)"),
                                               "DIVORCED",
                                               ifelse(marital_status == "VIUVO(A)",
                                                      "WIDOW",
                                                      NA)))),
         sex = ifelse(sex == "MASCULINO",
                      "MALE",
                      ifelse(sex == "FEMININO",
                             "FEMALE",
                             NA)),
         age = ifelse(age < 21, NA,age) # Mayors are only eligle if 21 years old when sworwn in
         )

# Eliminate duplicates (due to second round)
mayor_chars <- mayor_chars %>% 
  distinct()

mayor_chars %>% 
  group_by(sigla_ue,sigla_uf,year,sigla_partido,nome_candidato,sequencial_candidato) %>% 
  mutate(n_obs = n()) %>% 
  filter(n_obs != 1) # OK zero duplicates

# Read Self Reported Wealth Data -----
self_reported_wealth <- read_rds(here::here("data","processed","elections","self_reported_wealth.rds"))

# Join candidate chars and wealth
mayor_chars_wealth <- mayor_chars %>% 
  left_join(self_reported_wealth, by = c("descricao_eleicao","year"="ano_eleicao","sigla_uf","sequencial_candidato"="sq_candidato")) 

#check join
mayor_chars_wealth %>% filter(year != 2004) %>%  naniar::miss_var_summary() # around 9% NAs

# Read Campaign Finance Data -------

# Candidate Data -----
campaign_finance_cand <- read_rds(here::here("data","processed","elections","campaign_finance_cand_mayors.rds"))

# Check number of municipalities per year
campaign_finance_cand %>% 
  distinct(sigla_ue,sigla_uf,year) %>% 
  group_by(year) %>% 
  summarise(n_obs = n())

# Check if there is party duplicated in the same year
campaign_finance_cand %>%
  group_by(sigla_ue,sigla_uf,year,sigla_partido, nome_candidato, sq_candidato) %>% 
  summarise(n_obs = n()) %>% 
  filter(n_obs != 1)

# In the campaign finance data, candidate id is missing for 2004. Then, do joinings separate
campaign_finance_cand_2004 <- campaign_finance_cand %>% 
  filter(year == 2004)

# Confirm that all candidate id = NA
campaign_finance_cand_2004 %>% distinct(sq_candidato) # OK correct

campaign_finance_cand_2008_2012 <- campaign_finance_cand %>% 
  filter(year != 2004)

# Check if there are any candidate id = NA

campaign_finance_cand_2008_2012 %>% filter(is.na(sq_candidato)) # OK zero NAs

# Join candidate data 2004

mayor_chars_wealth_finance_cand_2004 <- mayor_chars_wealth %>% 
  filter(year == 2004) %>% 
  left_join(campaign_finance_cand_2004, by = c("year","sigla_uf","sigla_ue","nome_candidato"))

# Check matching quality using nome_municipio and sigla partido

mayor_chars_wealth_finance_cand_2004 %>% filter(nome_municipio.x != nome_municipio.y) # OK. Same city, different spelling

# The observation below is duplicated in the candidate characteristics raw data. Online search confirm is from PMDB. Exclude wrong match
dup <- mayor_chars_wealth_finance_cand_2004 %>% filter(sigla_partido.x != sigla_partido.y)

mayor_chars_wealth_finance_cand_2004 <- mayor_chars_wealth_finance_cand_2004 %>% 
  anti_join(dup, by = c("sigla_ue","year","sigla_uf","nome_candidato","sigla_partido.x")) %>% 
  rename(sigla_partido = sigla_partido.x, nome_municipio = nome_municipio.x) %>% 
  select(-sigla_partido.y,-nome_municipio.y)

# Join candidate data 2008 and 2012
mayor_chars_wealth_finance_cand_2008_2012 <- mayor_chars_wealth %>% 
  filter(year != 2004) %>% 
  left_join(campaign_finance_cand_2008_2012, by = c("year","sigla_uf","sigla_ue","sequencial_candidato"="sq_candidato"))

# Check matching quality using nome_municipio and sigla partido

#OK Same city, different spelling
mayor_chars_wealth_finance_cand_2008_2012 %>% filter(nome_municipio.x != nome_municipio.y) %>% distinct(nome_municipio.x,nome_municipio.y)

#OK Same names, different spelling
mayor_chars_wealth_finance_cand_2008_2012 %>% filter(nome_candidato.x != nome_candidato.y)

# OK party name in sigla_partido.x instead of acronym
mayor_chars_wealth_finance_cand_2008_2012 %>% filter(sigla_partido.x != sigla_partido.y)

mayor_chars_wealth_finance_cand_2008_2012 <- mayor_chars_wealth_finance_cand_2008_2012 %>% 
  rename(nome_municipio = nome_municipio.x,
         nome_candidato = nome_candidato.x,
         sigla_partido = sigla_partido.y) %>% 
  select(-nome_municipio.y,-nome_candidato.y,-sigla_partido.x)

# Bind years together
mayor_chars_wealth_finance_cand <- bind_rows(mayor_chars_wealth_finance_cand_2004,mayor_chars_wealth_finance_cand_2008_2012)
rm(mayor_chars_wealth_finance_cand_2004,mayor_chars_wealth_finance_cand_2008_2012)

# Committee Data ----
campaign_finance_com <- read_rds(here::here("data","processed","elections","campaign_finance_com_mayors.rds"))

# Check number of municipalities per year
campaign_finance_com %>% 
  distinct(sigla_ue,sigla_uf,year) %>% 
  group_by(year) %>% 
  summarise(n_obs = n())

# Check if there is party duplicated in the same year
campaign_finance_com %>%
  group_by(sigla_ue,sigla_uf,year,sigla_partido) %>% 
  summarise(n_obs = n()) %>% 
  filter(n_obs != 1) 

# Join candidates and committee datasets
mayor_chars_wealth_finance_cand_com <- mayor_chars_wealth_finance_cand %>% 
  left_join(campaign_finance_com, by = c("year","sigla_uf","sigla_ue","sigla_partido"))

# Check matching quality using nome_municipio and sigla partido
# OK Same cities (53), different spelling
mayor_chars_wealth_finance_cand_com %>%
  filter(nome_municipio.x != nome_municipio.y) %>% 
  distinct(nome_municipio.x,nome_municipio.y)

mayor_chars_wealth_finance_cand_com <- mayor_chars_wealth_finance_cand_com %>% 
  rename(nome_municipio = nome_municipio.x) %>% 
  select(-nome_municipio.y)

# Check if there is party duplicated in the same year
mayor_chars_wealth_finance_cand_com %>% 
  group_by(sigla_ue,sigla_uf,year,sigla_partido,sequencial_candidato) %>% 
  summarise(n_obs = n()) %>% 
  filter(n_obs != 1)

# Insert IBGE Codes ---------
tse_to_ibge <- read_delim(here::here("data","raw","correspondencia_municipios_tse_ibge.csv"),
                          skip =1,
                          delim = ";",
                          col_names=c("uf","tse_name","tse_code","mun_code","mun_name"),
                          col_types = "ccccc",
                          locale = locale(encodin="latin1")) %>% 
  mutate(mun_code = as.character(str_sub(mun_code,1,-2)),
         tse_code = as.numeric(tse_code))

# Check correspondence
mayor_chars_wealth_finance_cand_com %>% 
  mutate(codigo_municipio = as.numeric(sigla_ue)) %>%
  anti_join(tse_to_ibge, by = c("codigo_municipio" = "tse_code")) %>% 
  distinct(codigo_municipio)

# Join IBGE codes

mayor_chars_wealth_finance_cand_com <- mayor_chars_wealth_finance_cand_com %>%
  mutate(codigo_municipio = as.numeric(sigla_ue)) %>%
  left_join(tse_to_ibge, by = c("codigo_municipio" = "tse_code", "sigla_uf" = "uf")) %>% 
  select(-tse_name,-mun_name,-codigo_municipio,-sq_candidato) %>% 
  relocate(year,mun_code)

# Deflate monetary variables
gdp_deflator <- read_rds(here::here("data","processed","gdp_deflator.rds")) 

mayor_chars_wealth_finance_cand_com <- mayor_chars_wealth_finance_cand_com %>% 
  left_join(gdp_deflator, by = "year")

mayor_chars_wealth_finance_cand_com <- mayor_chars_wealth_finance_cand_com %>% 
  mutate_at(vars(starts_with("tot"),self_reported_wealth),
            .funs = funs(./gdp_deflator)) 

mayor_chars_wealth_finance_cand_com <- mayor_chars_wealth_finance_cand_com %>% 
  select(-gdp_deflator)

# Save
write_dta(mayor_chars_wealth_finance_cand_com,here::here("data","processed","candidates_chars.dta"))
  

